6.3 SDM 模型评估
6.3.1检验的原因和AUC测试的原理
```{r eval =F} 为了评估模型的预测准确性,我们返回测试数据,并使用交叉验证来测试模型。我们的评估将使用一种称为“接收方操作员曲线下的面积(AUC)”的方法。该过程是在预测上设置阈值,以生成不同水平的假阳性率(预测该物种不存在的地方的存在),然后根据假阳性的函数评估真实的阳性率(成功地预测存在)率。该曲线下的面积(从零到一)变化,对模型进行了评估。AUC值为0.5与对存在/不存在的随机猜测相同,而朝向1的值表示我们的预测更可靠。 当然,我们只有状态数据,没有缺勤。幸运的是,通过从区域内的随机点生成“伪缺席”,可以将AUC过程调整为仅存在数据。 此过程有很多警告,但就我们的目的而言,不太可能使我们误入歧途。
#### 6.3.2模型检验-AUC
```{r eval =F}
##AUC测试的一般形式:
## p可以是测试集或者训练集;
mod_eval_train <- dismo::evaluate(p=env_occ_train,a=env_bg,model=mod)
print(mod_eval_train)
##rattler.me是maxent结果;p是测试集数据;
e1 <- evaluate(rattler.me, p=rattlertest, a=bg, x=modelEnv)
plot(e1, 'ROC')
## AUC检测的两种方法
# alternative 1
# extract values
pvtest <- data.frame(extract(predictors, occtest))
avtest <- data.frame(extract(predictors, bg))
e2 <- evaluate(me, p=pvtest, a=avtest)
# alternative 2
# predict to testing points
testp <- predict(me, pvtest)
head(testp)
testa <- predict(me, avtest)
e3 <- evaluate(p=testp, a=testa)
e3
threshold(e3)
plot(e3, 'ROC')
6.3.3 模型检验-Max TSS
maxtss2 <- function(x) {
# x: a directory containing the cross-validated Maxent output
## Notes:
## tss = sens + spec - 1
## sens = 1 - omission
## spec = 1 - FPA
## tss = 1 - omission + 1 - fpa - 1
## = 1 - (omission + fpa)
ff <- list.files(x, 'omission\\.csv$', full.names=TRUE)
max_tss <- sapply(ff, function(f) {
d <- read.csv(f)
i <- which.min(d$Test.omission + d$Fractional.area)
type <- gsub('Corresponding\\.|\\.value', '', colnames(d)[3])
if(!tolower(type) %in% c('logistic', 'cloglog')) {
stop('Expected name of third column to contain either "logistic" or "Cloglog".')
}
c(max_tss=1 - min(d$Test.omission + d$Fractional.area), thr=d[, 3][i])
})
out <- t(max_tss)
rownames(out) <- basename(rownames(out))
list(max_tss=out, max_tss_mean=mean(out[, 'max_tss']),
max_tss_sd=sd(out[, 'max_tss']))
}
maxtss2("./na_AP")
## 参考自:
https://gist.github.com/johnbaums/3ff4c9aa01032ce21a84672c477a6dfb
6.3.4 模型检验-TSS
##注意常规tss,给定阈值之后即可计算。注意这里是阈值一般是采用最大敏感性与特异性之和计算;
## 下面这个函数只需要修改set threshold value
## 另外需要根据上面跑maxent的结果给出来nems1和nems2的值才可以!
nems1 <- c("species_0_backgroundPredictions",paste0("species_",1:9,"_backgroundPredictions"))
nems_1 <- paste0("C:/Users/admin/Desktop/sa_AP/",nems1,".csv")
nems2 <- c("species_0_samplePredictions",paste0("species_",1:9,"_samplePredictions"))
nems_2 <- paste0("C:/Users/admin/Desktop/sa_AP/",nems2,".csv")
calc_tss <- function(y){
backgroundpredictions <- read.csv(nems_1[y])
#we need the last column so will set the number as x
x <- length(backgroundpredictions)
#extract the cloglog/logistic results
backgroundclog <- backgroundpredictions[,x]
#now read in the sample predictions for testing
samplepredictions <- read.csv(nems_2[y])
#we need the last column again of logistic or cloglog predictions so set a second x
x2 <- length(samplepredictions)
#extract the cloglog/logistic results for sample
sampleclog <- samplepredictions[,x2]
#set n the number of pseuabsences used for backgroudn predictions by MaxEnt
n <- 5000
#set threshold value
th <- 0.27
TSS_calculations <- function (sample_clog, prediction_clog, n, th) {
xx <- sum(sample_clog > th)
yy <- sum(prediction_clog > th)
xxx <- sum(sample_clog < th)
yyy <- sum(prediction_clog < th)
ncount <- sum(xx,yy,xxx,yyy)
overallaccuracy <- (xx + yyy)/ncount
sensitivity <- xx / (xx + xxx)
specificity <- yyy / (yy + yyy)
tss <- sensitivity + specificity - 1
#kappa calculations
a <- xx + xxx
b <- xx + yy
c <- yy + yyy
d <- xxx + yyy
e <- a * b
f <- c * d
g <- e + f
h <- g / (ncount * ncount)
hup <- overallaccuracy - h
hdown <- 1 - h
kappa <- hup/hdown
Po <- (xx + yyy) / ncount
Pe <- ((b/ncount) * (a/ncount)) + ((d/ncount) * (c/ncount))
Px1 <- Po - Pe
Px2 <- 1 - Pe
Px3 <- Px1/Px2
tx1 <- xx + yyy
tx2 <- 2 * a * c
tx3 <- a - c
tx4 <- xx - yyy
tx5 <- ncount * (( 2 * a ) - tx4)
tx6 <- ncount * tx1
kappamax <- (tx6 - tx2 - (tx3 * tx4)) / ((tx5 - tx3) - (tx3 * tx4))
cat(" Maxent results for model with/n",a,"test sample predictions/n",c ,"background predicitons/n/n TSS value: ", tss,"/n Overall accuracy: ",overallaccuracy,"/n Sensitivity: ",sensitivity,"/n Specificity: ",specificity,"/n Kappa: ",kappa,"/n Kappa max: ",kappamax)
}
#run the function, the input values are the sampleclog values, then the background clog values, the sample number for the pseudo absences and then threshold value
TSS_calculations(sampleclog,backgroundclog,n,th)
}
6.3.4 模型检验-AIC
## 理解AUC
AUC diff:表征的是测试集与训练集的auc值,两者的差异越小,可以避免一种情况是训练集中auc值很好,但测试集中auc值很低的情况;
model.present <- glm(pb ~ .,data=sdmdata.present)
reduced.sdmdata.present<-subset(sdmdata.present, select=c(-ph,-sstrange))
reduced.present.model<-glm(pb ~ .,data=reduced.sdmdata.present)
summary(reduced.present.model)
Anova(model.present,test="Wald")
## 计算AIC:
# AIC model with all variables------------------------------------------------------------------------------------------------------------------
k <- length(model.present$coefficients)
aic <- (2*k)-(2*logLik(model.present)[[1]])
round(aic)
# AIC model with selected variables------------------------------------------------------------------------------------------------------------------
k1 <- length(reduced.present.model$coefficients)
aic1 <- (2*k1)-(2*logLik(reduced.present.model)[[1]])
round(aic1)
gof1 <- (reduced.present.model$null.deviance-reduced.present.model$deviance)/reduced.present.model$null.deviance
gof1
6.3.4 模型检验-pauc
参见:NicheToolBox